Source

knitr::opts_chunk$set(
    fig.align = "center",
    fig.pos = "h",
    message = FALSE,
    warning = FALSE,
    out.width = "90%",
    dpi = 300,
    cache = TRUE
)

library(dplyr)
library(ggplot2)
library(ggtext)
library(stringr)
library(tidyr)
library(ngram)
library(extrafont)
library(grid)
library(gridExtra)
loadfonts()

newtheme <- theme_minimal(base_family = "Candara")+
  theme(plot.title = element_markdown(size = 11, color = "grey30", margin = margin(b = -8)),
        plot.subtitle = element_text(size = 9.5, color = "grey40", hjust = 1),
        legend.position = "none",
        panel.grid = element_blank(),
        panel.spacing = unit(0, "mm"),
        panel.background = element_blank(),
        strip.background = element_blank(),
        strip.text = element_text(size = 8.5, margin = margin(t = 2), 
                                  color = "grey60", family = "Haettenschweiler"),
        axis.text = element_blank(),
        axis.title = element_blank())

Legendarium_titles <- c("The Hobbit",
                        "The Fellowship of the Ring",
                        "The Two Towers",
                        "The Return of the King",
                        "The Silmarillion*",
                        "The Children of Húrin*",
                        "Of Tuor and His Coming to Gondolin*")

load("data/Legendarium.RData")
readLines_wrap <- function(c) {
  readLines(paste0("data/", c), encoding = "UTF-8")%>%
  as.data.frame() %>% `colnames<-`("Text")
}

Hobbit_raw <- readLines_wrap("The Hobbit.txt")

LoTR_1_raw <- readLines_wrap("01 - The Fellowship Of The Ring.txt")

LoTR_2_raw <- readLines_wrap("02 - The Two Towers.txt")

LoTR_3_raw <- readLines_wrap("03 - The Return Of The King.txt")

Sil_raw <- readLines_wrap("The Silmarillion.txt")

CoH_raw <- readLines_wrap("The Children of Hurin.txt")

Tuor_raw <- readLines_wrap("Of Tuor And His Coming To Gondolin.txt")
## Functions for Data Cleaning
Tolkien_cleaning <- function(text_raw) {
  text_raw %>%
    mutate(Text = trimws(Text))%>%
    filter(Text != "")%>%
    mutate(Text = str_remove_all(Text, "\\f"))
}

Tolkien_summarise <- function(text) {
  text %>%
    group_by(Chapter) %>%
    mutate(Title = nth(Text, 2))%>%
    slice(-1)%>%
    filter(Text != Title, Text != as.character(Chapter))%>%
    summarise(Title = unique(Title),
              Text = paste(Text, collapse = " "),
              Word_Count = sum(wordcount(Text)))
}

Data Cleaning

## Hobbit --------------------
Hobbit <- Hobbit_raw %>%
  Tolkien_cleaning()%>%
  slice(which(str_detect(Text, "Chapter 1")):n())%>%
  mutate(Chapter =  findInterval(1:n(), which(str_detect(Text, "Chapter [0-9]+"))) %>%
         as.factor())%>%
  Tolkien_summarise()

## LoTR ----------------------
LoTR_cleaning <- function(text_raw, Volume) {
  text_raw %>%
    Tolkien_cleaning() %>%
    slice( min(which(str_detect(Text, "_Chapter 1_"))):
          (min(which(str_detect(Text, "Here ends the [a-z]+ part|======"))) - 1))%>%
    mutate(Book = ifelse(row_number() < max(which(Text == "_Chapter 1_")),
                         2 * Volume - 1, 2 * Volume),
           Book = as.roman(Book)) %>%
    group_by(Book) %>% 
    mutate(Chapter = paste0(Book, "-", findInterval(1:n(), which(str_detect(Text, "_Chapter [0-9]+_")))))%>%
    mutate(Chapter = factor(Chapter, levels = str_sort(unique(Chapter), numeric = TRUE))) %>%
    mutate(Text = str_remove_all(Text, "_"))%>%
    Tolkien_summarise()
}

LoTR_1 <- LoTR_cleaning(LoTR_1_raw, Volume = 1)
LoTR_2 <- LoTR_cleaning(LoTR_2_raw, Volume = 2)
LoTR_3 <- LoTR_cleaning(LoTR_3_raw, Volume = 3)

## The Silmarillion -----------------
Sil <- Sil_raw %>%
  Tolkien_cleaning()%>%
  slice( which(Text == "AINULINDALË"):
        max(which(Text == "NOTE ON PRONUNCIATION") - 1))%>%
  mutate(Chapter =  findInterval(1:n(), which(str_detect(Text, "Chapter [0-9]+"))))%>%
  mutate(Chapter = case_when(row_number() < which(Text == "VALAQUENTA") ~ "AINU", 
                             row_number() < which(Text == "QUENTA SILMARILLION") ~ "VALA", 
                             row_number() >= which(Text == "OF THE RINGS OF POWER AND THE THIRD AGE") ~ "RINGS",
                             row_number() >= which(Text == "AKALLABÊTH") ~ "AKALLABÊTH", 
                             TRUE ~ as.character(Chapter)))%>%
  filter(Chapter != "0") %>%
  mutate(Chapter = factor(Chapter, levels = c("AINU", "VALA", as.character(1:24),
                                              "AKALLABÊTH", "RINGS")))%>%
  Tolkien_summarise()

## The Children of Húrin ------------
CoH <- CoH_raw %>%
  Tolkien_cleaning()%>%
  filter(!str_detect(Text, "—"))%>%
  slice( which(str_detect(Text, "CHAPTER I")):
        (which(str_detect(Text, "APPENDIX")) - 1))%>%
  mutate(Chapter =  findInterval(1:n(), which(str_detect(Text, "CHAPTER "))))%>%
  mutate(Chapter = as.factor(Chapter))%>%
  Tolkien_summarise()

## Of Tuor And His Coming To Gondolin ------------
Tuor <- Tuor_raw %>%
  Tolkien_cleaning() %>%
  mutate(Chapter = "") %>%
  group_by(Chapter) %>%
  summarise(Text = paste(Text, collapse = " "))%>%
  mutate(Word_Count = wordcount(Text))%>%
  mutate(Chapter = as.factor(Chapter))
Legendarium_list <- list(Hobbit, LoTR_1, LoTR_2, LoTR_3, Sil, CoH, Tuor)%>%
  `names<-`(Legendarium_titles)

lex_width <- sapply(Legendarium_list, function(df) {sum(df$Word_Count)})%>%
    unname()
  
lex_matrix <- sapply(1:length(lex_width), function(x) {
    out <- rep(x, max(lex_width))
    if(x != which.max(lex_width)) {
      out[(lex_width[x]+1):max(lex_width)] <- NA}
    return(out)
  }) %>% t()
  
#save(Legendarium_list, lex_width, lex_matrix, file = "data/Legendarium.RData")

Lexical Dispersion Plot

Keyword_locate <- function(text, keyword_regex, exclude = NULL) {
  
  split <- paste0("(?<=", tolower(keyword_regex), ")")
  
  if(!is.null(exclude)) {
    split <- paste0(split, "(?<!", tolower(exclude), ")")
  }
  
  split <- paste0(split, "\\S*")
  
  loc <- tolower(text) %>%
    str_split(split) %>% 
    unlist %>% trimws%>%
    sapply(wordcount) %>% 
    unname()
  
  loc <- loc[-length(loc)] %>% cumsum
  
  if(length(loc) == 0) {NA}
  else {loc}
}

Keyword_df <- function(text_df, keyword_regex, exclude = NULL) {
  
  kword_loc <- sapply(text_df$Text, Keyword_locate, 
                      keyword_regex = keyword_regex,
                      exclude = exclude)
  if(typeof(kword_loc) == "list") { kword_times <- sapply(kword_loc, length) }
  else { kword_times <- sum(length(kword_loc)) }
  
  data.frame(Chapter = rep(text_df$Chapter, kword_times),
             Keyword_Loc = unlist(kword_loc) %>% unname,
             Keyword_Count = length(unlist(kword_loc)[!is.na(unlist(kword_loc))]))%>%
    mutate(Chapter = factor(Chapter, levels(text_df$Chapter))) %>%
    merge(text_df %>% select(-Text))%>%
    mutate()
}
## Lexical Dispersion Plot
lex_dispersion <- function(title, Keyword_Loc_list) {
  
  w_count <- formatC(sum(Legendarium_list[[title]]$Word_Count), 
                                format = "f", big.mark = ",", digits = 0)
  k_count <- Keyword_Loc_list[[title]][1, "Keyword_Count"]
  
  plot_title <- paste0("***", title, "*** - <span style='color:tomato4'>",
                       k_count, ifelse(k_count <= 1, " Time", " Times"), "</span>")
  plot_subtitle <- paste(w_count, "Words")

  if(title == tail(Legendarium_titles, 1)) {
    plot_title <- paste(plot_title, "<span style='font-family:Candara; font-size:9.5pt'>",
                        plot_subtitle, "</span>")
    plot_subtitle <- " "
  }

  p <- ggplot(Keyword_Loc_list[[title]]) +
    geom_rect(data = Legendarium_list[[title]] %>% 
                mutate(fill = (row_number() %% 2 == 1)),
              aes(xmin = 0, xmax = Word_Count, 
                  ymin = 0, ymax = 1, fill = fill))+
    labs(title = plot_title, subtitle = plot_subtitle)+
    scale_fill_manual(values = c("#d5d6d9", "#ededed"))+
    coord_cartesian(expand = FALSE)+
    facet_grid(~ Chapter, scales = "free_x", space = "free_x",
               switch = "x")+
    newtheme
  
  if(k_count > 0) {
    p <- p + geom_segment(aes(x = Keyword_Loc, xend = Keyword_Loc,
                              y = 0, yend = 1), 
                          size = 0.5, color = "tomato3",
                          alpha = 0.6)
    }
  

  p
}

Legendarium_lex_dispersion <- function(keyword, keyword_regex, exclude = NULL) {
  Keyword_Loc_list <- lapply(Legendarium_list, Keyword_df, 
                             keyword_regex = keyword_regex, 
                             exclude = exclude) %>%
  `names<-`(Legendarium_titles)


  lex_list <- lapply(Legendarium_titles, lex_dispersion, 
                     Keyword_Loc_list = Keyword_Loc_list)
  
  grid.arrange(
    grobs = lex_list,
    top = textGrob(c(toupper(paste0("The Appearance of the Word \"", 
                                  keyword, "\" in Tolkien's Legendarium\n")), 
                   "* Books Edited by Christopher Tolkien\n"),
                 x = c(0.01, 0.99), hjust = c(0, 1), 
                 gp = gpar(fontfamily = "Candara", 
                           col = c("grey25", "grey50"), lineheight = 0.3,
                           fontsize = c(13, 10.5), fontface = c("bold", "plain"))),
    bottom = textGrob("@akela", 
                 hjust = 1, vjust = 0, x = 0.99, y = 1,
                 gp = gpar(fontfamily = "Candara", col = "grey65", fontsize = 10)),
               layout_matrix = lex_matrix
    )
}

Keywords

Hope/Hopeful

Legendarium_lex_dispersion(
  keyword = "hope\", \"hopeful\" or \"estel",
  keyword_regex = "hope|estel",
  exclude = "no hope|hopeless")

Hopeless/No Hope/Desperate/Despair

Legendarium_lex_dispersion(
  keyword = "hopeless\", \"no hope\" or \"desperate",
  keyword_regex = "hopeless|desperate|no hope|despair")

#### Doom/Fate

Legendarium_lex_dispersion(
  keyword = "doom\" or \"fate",
  keyword_regex = "\\bdoom\\b|\\bdooms\\b|fate")

Tiding

Legendarium_lex_dispersion(
  keyword = "tiding",
  keyword_regex = "tiding")

Races

Hobbit/Halfling

Legendarium_lex_dispersion(
  keyword = "hobbit\" or \"halfling",
  keyword_regex = "hobbit|halfling")

Elf

Legendarium_lex_dispersion(
  keyword = "elf",
  keyword_regex = "\\belf\\b|\\belves\\b|\\belven\\b")

Dwarf

Legendarium_lex_dispersion(
  keyword = "dwarf",
  keyword_regex = "dwarf|dwarves|dwarven")

Orc/Goblin

Legendarium_lex_dispersion(
  keyword = "Orc\" or \"Goblin",
  keyword_regex = "\\borc\\b|\\borcs\\b|goblin")

Eagle

Legendarium_lex_dispersion(
  keyword = "eagle",
  keyword_regex = "eagle")

Numbers

3

Legendarium_lex_dispersion(
  keyword = "three\" or \"third" ,
  keyword_regex = "\\bthree\\b|\\bthird\\b")

7

Legendarium_lex_dispersion(
  keyword = "seven\" or \"seventh",
  keyword_regex = "\\bseven\\b|seventh")

9

Legendarium_lex_dispersion(
  keyword = "nine\" or \"ninth",
  keyword_regex = "\\bnine\\b|ninth")